{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "da1eaf61",
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "import sys\n",
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "255d7344-8fd6-4c79-9b0f-e5f58289fc8a",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv(\"/Users/wenzel/Downloads/ITA_OSHA_Combined.csv\", low_memory=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "a9314e89-f7f0-49cb-bf9f-4aeb0c076056",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>company_name</th>\n",
       "      <th>establishment_name</th>\n",
       "      <th>ein</th>\n",
       "      <th>street_address</th>\n",
       "      <th>city</th>\n",
       "      <th>state</th>\n",
       "      <th>zip_code</th>\n",
       "      <th>naics_code</th>\n",
       "      <th>industry_description</th>\n",
       "      <th>...</th>\n",
       "      <th>total_hearing_loss</th>\n",
       "      <th>total_other_illnesses</th>\n",
       "      <th>establishment_id</th>\n",
       "      <th>establishment_type</th>\n",
       "      <th>size</th>\n",
       "      <th>year_filing_for</th>\n",
       "      <th>created_timestamp</th>\n",
       "      <th>change_reason</th>\n",
       "      <th>source</th>\n",
       "      <th>delete</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>4</td>\n",
       "      <td>McKamish Inc.</td>\n",
       "      <td>McKamish Inc.</td>\n",
       "      <td>NaN</td>\n",
       "      <td>50 55th Street</td>\n",
       "      <td>Pittsburgh</td>\n",
       "      <td>PA</td>\n",
       "      <td>15201.0</td>\n",
       "      <td>238220</td>\n",
       "      <td>Heating, ventilation and air-conditioning (HVA...</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>41920</td>\n",
       "      <td>1.0</td>\n",
       "      <td>3</td>\n",
       "      <td>2016</td>\n",
       "      <td>8/1/2017 6:12:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>ITA Data CY 2016.csv</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>5</td>\n",
       "      <td>The Talaria Company LLC</td>\n",
       "      <td>The Hinckley Company</td>\n",
       "      <td>NaN</td>\n",
       "      <td>40 Industrial Way</td>\n",
       "      <td>Trenton</td>\n",
       "      <td>ME</td>\n",
       "      <td>4605.0</td>\n",
       "      <td>336612</td>\n",
       "      <td>Pleasure boats manufacturing</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>41922</td>\n",
       "      <td>1.0</td>\n",
       "      <td>3</td>\n",
       "      <td>2016</td>\n",
       "      <td>8/1/2017 6:23:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>ITA Data CY 2016.csv</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>6</td>\n",
       "      <td>Williamsburg Manufacturing</td>\n",
       "      <td>Williamsburg Manufacturing</td>\n",
       "      <td>NaN</td>\n",
       "      <td>408 Maplewood Ave</td>\n",
       "      <td>Williamsburg</td>\n",
       "      <td>IA</td>\n",
       "      <td>52361.0</td>\n",
       "      <td>336370</td>\n",
       "      <td>Motor vehicle metal parts stamping</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>41923</td>\n",
       "      <td>1.0</td>\n",
       "      <td>3</td>\n",
       "      <td>2016</td>\n",
       "      <td>8/1/2017 6:27:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>ITA Data CY 2016.csv</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>7</td>\n",
       "      <td>The Talaria Company LLC</td>\n",
       "      <td>Morris Yachts LLC</td>\n",
       "      <td>NaN</td>\n",
       "      <td>27 Ramp Road</td>\n",
       "      <td>Trenton</td>\n",
       "      <td>ME</td>\n",
       "      <td>4605.0</td>\n",
       "      <td>336612</td>\n",
       "      <td>Pleasure boats manufacturing</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>41925</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2</td>\n",
       "      <td>2016</td>\n",
       "      <td>8/1/2017 6:36:00</td>\n",
       "      <td>Hit wrong hyperlink</td>\n",
       "      <td>ITA Data CY 2016.csv</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>8</td>\n",
       "      <td>The Talaria Company LLC</td>\n",
       "      <td>Hunt Yachts LLC</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1909 Alden Landing</td>\n",
       "      <td>Portsmouth</td>\n",
       "      <td>RI</td>\n",
       "      <td>2871.0</td>\n",
       "      <td>336612</td>\n",
       "      <td>Pleasure boats manufacturing</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>41926</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2</td>\n",
       "      <td>2016</td>\n",
       "      <td>8/1/2017 6:35:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>ITA Data CY 2016.csv</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1635159</th>\n",
       "      <td>1724282</td>\n",
       "      <td>Thombert Inc</td>\n",
       "      <td>Thombert</td>\n",
       "      <td>420670188.0</td>\n",
       "      <td>220 Industrial Drive</td>\n",
       "      <td>Brooklyn</td>\n",
       "      <td>IA</td>\n",
       "      <td>52211.0</td>\n",
       "      <td>325211</td>\n",
       "      <td>Acetal resins manufacturing</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>893662</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2</td>\n",
       "      <td>2021</td>\n",
       "      <td>3/14/2022 21:39:34</td>\n",
       "      <td>NaN</td>\n",
       "      <td>ITA Data CY 2021 submitted thru 3-14-2022 v3.csv</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1635160</th>\n",
       "      <td>1724283</td>\n",
       "      <td>ADF International</td>\n",
       "      <td>ADF International</td>\n",
       "      <td>650370294.0</td>\n",
       "      <td>1900 Great Bear Ave.</td>\n",
       "      <td>Great Falls</td>\n",
       "      <td>MT</td>\n",
       "      <td>59404.0</td>\n",
       "      <td>332312</td>\n",
       "      <td>Structural steel, fabricated, manufacturing</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>893661</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2</td>\n",
       "      <td>2021</td>\n",
       "      <td>3/14/2022 21:40:52</td>\n",
       "      <td>NaN</td>\n",
       "      <td>ITA Data CY 2021 submitted thru 3-14-2022 v3.csv</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1635161</th>\n",
       "      <td>1724284</td>\n",
       "      <td>TDK Corporation of America</td>\n",
       "      <td>TCA - San Jose</td>\n",
       "      <td>952893461.0</td>\n",
       "      <td>1745 Technology Drive, Suite 200</td>\n",
       "      <td>San Jose</td>\n",
       "      <td>CA</td>\n",
       "      <td>95110.0</td>\n",
       "      <td>423690</td>\n",
       "      <td>Capacitors, electronic, merchant wholesalers</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>502665</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2</td>\n",
       "      <td>2021</td>\n",
       "      <td>3/14/2022 21:58:15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>ITA Data CY 2021 submitted thru 3-14-2022 v3.csv</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1635162</th>\n",
       "      <td>1724285</td>\n",
       "      <td>TDK Corporation of America</td>\n",
       "      <td>TCA Lincolnshire</td>\n",
       "      <td>952893461.0</td>\n",
       "      <td>475 Half Day Road</td>\n",
       "      <td>Lincolnshire</td>\n",
       "      <td>IL</td>\n",
       "      <td>60069.0</td>\n",
       "      <td>423690</td>\n",
       "      <td>Capacitors, electronic, merchant wholesalers</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>502663</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2</td>\n",
       "      <td>2021</td>\n",
       "      <td>3/14/2022 21:58:15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>ITA Data CY 2021 submitted thru 3-14-2022 v3.csv</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1635163</th>\n",
       "      <td>1724286</td>\n",
       "      <td>Morello Concrete Construction Inc.</td>\n",
       "      <td>Morello Concrete Construction Inc.</td>\n",
       "      <td>330449044.0</td>\n",
       "      <td>8534 Hubbles Lane</td>\n",
       "      <td>Santee</td>\n",
       "      <td>CA</td>\n",
       "      <td>92071.0</td>\n",
       "      <td>238110</td>\n",
       "      <td>Foundation, building, poured concrete, contra...</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>427167</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2</td>\n",
       "      <td>2021</td>\n",
       "      <td>3/14/2022 22:01:33</td>\n",
       "      <td>NaN</td>\n",
       "      <td>ITA Data CY 2021 submitted thru 3-14-2022 v3.csv</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1635164 rows × 33 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "              id                        company_name  \\\n",
       "0              4                       McKamish Inc.   \n",
       "1              5             The Talaria Company LLC   \n",
       "2              6          Williamsburg Manufacturing   \n",
       "3              7             The Talaria Company LLC   \n",
       "4              8             The Talaria Company LLC   \n",
       "...          ...                                 ...   \n",
       "1635159  1724282                        Thombert Inc   \n",
       "1635160  1724283                   ADF International   \n",
       "1635161  1724284          TDK Corporation of America   \n",
       "1635162  1724285          TDK Corporation of America   \n",
       "1635163  1724286  Morello Concrete Construction Inc.   \n",
       "\n",
       "                         establishment_name          ein  \\\n",
       "0                             McKamish Inc.          NaN   \n",
       "1                      The Hinckley Company          NaN   \n",
       "2                Williamsburg Manufacturing          NaN   \n",
       "3                         Morris Yachts LLC          NaN   \n",
       "4                           Hunt Yachts LLC          NaN   \n",
       "...                                     ...          ...   \n",
       "1635159                            Thombert  420670188.0   \n",
       "1635160                   ADF International  650370294.0   \n",
       "1635161                      TCA - San Jose  952893461.0   \n",
       "1635162                    TCA Lincolnshire  952893461.0   \n",
       "1635163  Morello Concrete Construction Inc.  330449044.0   \n",
       "\n",
       "                           street_address          city state  zip_code  \\\n",
       "0                          50 55th Street    Pittsburgh    PA   15201.0   \n",
       "1                       40 Industrial Way       Trenton    ME    4605.0   \n",
       "2                       408 Maplewood Ave  Williamsburg    IA   52361.0   \n",
       "3                            27 Ramp Road       Trenton    ME    4605.0   \n",
       "4                      1909 Alden Landing    Portsmouth    RI    2871.0   \n",
       "...                                   ...           ...   ...       ...   \n",
       "1635159              220 Industrial Drive      Brooklyn    IA   52211.0   \n",
       "1635160              1900 Great Bear Ave.   Great Falls    MT   59404.0   \n",
       "1635161  1745 Technology Drive, Suite 200      San Jose    CA   95110.0   \n",
       "1635162                 475 Half Day Road  Lincolnshire    IL   60069.0   \n",
       "1635163                 8534 Hubbles Lane        Santee    CA   92071.0   \n",
       "\n",
       "         naics_code                               industry_description  ...  \\\n",
       "0            238220  Heating, ventilation and air-conditioning (HVA...  ...   \n",
       "1            336612                       Pleasure boats manufacturing  ...   \n",
       "2            336370                 Motor vehicle metal parts stamping  ...   \n",
       "3            336612                       Pleasure boats manufacturing  ...   \n",
       "4            336612                       Pleasure boats manufacturing  ...   \n",
       "...             ...                                                ...  ...   \n",
       "1635159      325211                       Acetal resins manufacturing   ...   \n",
       "1635160      332312       Structural steel, fabricated, manufacturing   ...   \n",
       "1635161      423690      Capacitors, electronic, merchant wholesalers   ...   \n",
       "1635162      423690      Capacitors, electronic, merchant wholesalers   ...   \n",
       "1635163      238110   Foundation, building, poured concrete, contra...  ...   \n",
       "\n",
       "         total_hearing_loss  total_other_illnesses  establishment_id  \\\n",
       "0                         0                      0             41920   \n",
       "1                         0                      0             41922   \n",
       "2                         0                      0             41923   \n",
       "3                         0                      0             41925   \n",
       "4                         0                      0             41926   \n",
       "...                     ...                    ...               ...   \n",
       "1635159                   0                      0            893662   \n",
       "1635160                   0                      0            893661   \n",
       "1635161                   0                      0            502665   \n",
       "1635162                   0                      0            502663   \n",
       "1635163                   0                      0            427167   \n",
       "\n",
       "         establishment_type  size  year_filing_for   created_timestamp  \\\n",
       "0                       1.0     3             2016    8/1/2017 6:12:00   \n",
       "1                       1.0     3             2016    8/1/2017 6:23:00   \n",
       "2                       1.0     3             2016    8/1/2017 6:27:00   \n",
       "3                       1.0     2             2016    8/1/2017 6:36:00   \n",
       "4                       1.0     2             2016    8/1/2017 6:35:00   \n",
       "...                     ...   ...              ...                 ...   \n",
       "1635159                 1.0     2             2021  3/14/2022 21:39:34   \n",
       "1635160                 1.0     2             2021  3/14/2022 21:40:52   \n",
       "1635161                 1.0     2             2021  3/14/2022 21:58:15   \n",
       "1635162                 1.0     2             2021  3/14/2022 21:58:15   \n",
       "1635163                 1.0     2             2021  3/14/2022 22:01:33   \n",
       "\n",
       "               change_reason  \\\n",
       "0                        NaN   \n",
       "1                        NaN   \n",
       "2                        NaN   \n",
       "3        Hit wrong hyperlink   \n",
       "4                        NaN   \n",
       "...                      ...   \n",
       "1635159                  NaN   \n",
       "1635160                  NaN   \n",
       "1635161                  NaN   \n",
       "1635162                  NaN   \n",
       "1635163                  NaN   \n",
       "\n",
       "                                                   source  delete  \n",
       "0                                    ITA Data CY 2016.csv     NaN  \n",
       "1                                    ITA Data CY 2016.csv     NaN  \n",
       "2                                    ITA Data CY 2016.csv     NaN  \n",
       "3                                    ITA Data CY 2016.csv     NaN  \n",
       "4                                    ITA Data CY 2016.csv     NaN  \n",
       "...                                                   ...     ...  \n",
       "1635159  ITA Data CY 2021 submitted thru 3-14-2022 v3.csv     0.0  \n",
       "1635160  ITA Data CY 2021 submitted thru 3-14-2022 v3.csv     0.0  \n",
       "1635161  ITA Data CY 2021 submitted thru 3-14-2022 v3.csv     0.0  \n",
       "1635162  ITA Data CY 2021 submitted thru 3-14-2022 v3.csv     0.0  \n",
       "1635163  ITA Data CY 2021 submitted thru 3-14-2022 v3.csv     0.0  \n",
       "\n",
       "[1635164 rows x 33 columns]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "df6ebe9a-695d-45b9-9fb8-b961ad0b62ff",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 批量字符替换解决公司名称中含有逗号的问题\n",
    "f=open('/Users/wenzel/Downloads/ITA_OSHA_Combined.csv','r')\n",
    "alllines=f.readlines()\n",
    "f.close()\n",
    "f=open('/Users/wenzel/Downloads/ITA_OSHA_Combined.csv','w+')\n",
    "for eachline in alllines:\n",
    "    eachline=re.sub(', Inc\\.',' Inc.',eachline) #这里不转译的话'.'就被当作正则表达式的任意字符了\n",
    "    eachline=re.sub(', LLC',' LLC',eachline)\n",
    "    f.writelines(eachline)\n",
    "f.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "137b53d8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 策略一:清除所有含空值，即不完全的数据行\n",
    "df.drop('delete',axis=1, inplace=True)\n",
    "df.dropna(inplace = True)\n",
    "\n",
    "# 策略二：均值或中位数取代空值的思想\n",
    "\n",
    "# df = pd.read_csv('property-data.csv')\n",
    "\n",
    "# x = df[\"ST_NUM\"].median()\n",
    "\n",
    "# df[\"ST_NUM\"].fillna(x, inplace = True)\n",
    "\n",
    "# print(df.to_string())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "2a09fb6f-1449-4a2d-b500-f85c20a261c7",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>company_name</th>\n",
       "      <th>establishment_name</th>\n",
       "      <th>ein</th>\n",
       "      <th>street_address</th>\n",
       "      <th>city</th>\n",
       "      <th>state</th>\n",
       "      <th>zip_code</th>\n",
       "      <th>naics_code</th>\n",
       "      <th>industry_description</th>\n",
       "      <th>...</th>\n",
       "      <th>total_skin_disorders</th>\n",
       "      <th>total_hearing_loss</th>\n",
       "      <th>total_other_illnesses</th>\n",
       "      <th>establishment_id</th>\n",
       "      <th>establishment_type</th>\n",
       "      <th>size</th>\n",
       "      <th>year_filing_for</th>\n",
       "      <th>created_timestamp</th>\n",
       "      <th>change_reason</th>\n",
       "      <th>source</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>215292</th>\n",
       "      <td>475886</td>\n",
       "      <td>Sion Power</td>\n",
       "      <td>Sion Power Tucson Facility</td>\n",
       "      <td>112955458.0</td>\n",
       "      <td>2900 E. Elvira Road</td>\n",
       "      <td>Tucson</td>\n",
       "      <td>AZ</td>\n",
       "      <td>85756.0</td>\n",
       "      <td>335911</td>\n",
       "      <td>Batteries, rechargeable, manufacturing</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>407479</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2</td>\n",
       "      <td>2018</td>\n",
       "      <td>1/2/2019 19:47:19</td>\n",
       "      <td>I have recently taken over this position for m...</td>\n",
       "      <td>ITA Data CY 2018.csv</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>215832</th>\n",
       "      <td>476426</td>\n",
       "      <td>Cardiology of Southern California</td>\n",
       "      <td>Cardiology of Southern California</td>\n",
       "      <td>208275855.0</td>\n",
       "      <td>655 Euclid Ave, Suite 304</td>\n",
       "      <td>National City</td>\n",
       "      <td>CA</td>\n",
       "      <td>91950.0</td>\n",
       "      <td>621111</td>\n",
       "      <td>MDs' (medical doctors, except mental health) ...</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>407869</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2</td>\n",
       "      <td>2018</td>\n",
       "      <td>1/3/2019 20:38:52</td>\n",
       "      <td>Incorrect Average number of employees, said 2 ...</td>\n",
       "      <td>ITA Data CY 2018.csv</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>217693</th>\n",
       "      <td>478674</td>\n",
       "      <td>Chicken Hawk Transport</td>\n",
       "      <td>Chicken Hawk Transport</td>\n",
       "      <td>820516501.0</td>\n",
       "      <td>235 London Drive</td>\n",
       "      <td>Sparks</td>\n",
       "      <td>NV</td>\n",
       "      <td>89434.0</td>\n",
       "      <td>484121</td>\n",
       "      <td>General freight trucking, long-distance, truc...</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>408826</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2</td>\n",
       "      <td>2018</td>\n",
       "      <td>1/8/2019 18:48:31</td>\n",
       "      <td>Correction on case classification</td>\n",
       "      <td>ITA Data CY 2018.csv</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>221546</th>\n",
       "      <td>482527</td>\n",
       "      <td>Scougal Rubber Corporation</td>\n",
       "      <td>Scougal Rubber Corporation</td>\n",
       "      <td>900907093.0</td>\n",
       "      <td>885 Denmark Drive, Suite 103</td>\n",
       "      <td>McCarran</td>\n",
       "      <td>NV</td>\n",
       "      <td>89434.0</td>\n",
       "      <td>326291</td>\n",
       "      <td>Rubber goods, mechanical (i.e.,  extruded, la...</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>78702</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2</td>\n",
       "      <td>2018</td>\n",
       "      <td>1/11/2019 22:43:22</td>\n",
       "      <td>Website indicates submission was necessary</td>\n",
       "      <td>ITA Data CY 2018.csv</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>228725</th>\n",
       "      <td>489709</td>\n",
       "      <td>NuWeld Inc.</td>\n",
       "      <td>NuWeld Inc.</td>\n",
       "      <td>232922276.0</td>\n",
       "      <td>2600 Reach Road</td>\n",
       "      <td>Williamsport</td>\n",
       "      <td>PA</td>\n",
       "      <td>17701.0</td>\n",
       "      <td>332996</td>\n",
       "      <td>Fabricated pipe and pipe fittings made from p...</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>284571</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2</td>\n",
       "      <td>2018</td>\n",
       "      <td>1/18/2019 14:41:02</td>\n",
       "      <td>change in hours</td>\n",
       "      <td>ITA Data CY 2018.csv</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1634472</th>\n",
       "      <td>1723592</td>\n",
       "      <td>Century Linen Service, Inc</td>\n",
       "      <td>Century Linen Service, Inc</td>\n",
       "      <td>205200067.0</td>\n",
       "      <td>542 N Perry Street</td>\n",
       "      <td>Johnstown</td>\n",
       "      <td>NY</td>\n",
       "      <td>12095.0</td>\n",
       "      <td>812331</td>\n",
       "      <td>Laundries, linen and uniform supply</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>626501</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2</td>\n",
       "      <td>2021</td>\n",
       "      <td>3/12/2022 18:56:29</td>\n",
       "      <td>The original data was used from 2020 not 2021 ...</td>\n",
       "      <td>ITA Data CY 2021 submitted thru 3-14-2022 v3.csv</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1634474</th>\n",
       "      <td>1723594</td>\n",
       "      <td>CENTURY LINEN&amp; UNIFORM INC</td>\n",
       "      <td>SYRACUSE</td>\n",
       "      <td>141009105.0</td>\n",
       "      <td>320 W TAYLOR STREET</td>\n",
       "      <td>SYRACUSE</td>\n",
       "      <td>NY</td>\n",
       "      <td>13507.0</td>\n",
       "      <td>812331</td>\n",
       "      <td>Laundries, linen and uniform supply</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>629942</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2</td>\n",
       "      <td>2021</td>\n",
       "      <td>3/12/2022 18:55:20</td>\n",
       "      <td>The original data was used from 2020 not 2021 ...</td>\n",
       "      <td>ITA Data CY 2021 submitted thru 3-14-2022 v3.csv</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1634478</th>\n",
       "      <td>1723598</td>\n",
       "      <td>APX OPERATING COMPANY LLC</td>\n",
       "      <td>Boomers Livermore</td>\n",
       "      <td>850850598.0</td>\n",
       "      <td>400 Kitty Hawk Road</td>\n",
       "      <td>Livermore</td>\n",
       "      <td>CA</td>\n",
       "      <td>94551.0</td>\n",
       "      <td>713110</td>\n",
       "      <td>713110 Amusement and Theme Parks</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>741179</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2</td>\n",
       "      <td>2021</td>\n",
       "      <td>3/12/2022 19:11:25</td>\n",
       "      <td>missing days away from work info - corrected</td>\n",
       "      <td>ITA Data CY 2021 submitted thru 3-14-2022 v3.csv</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1634479</th>\n",
       "      <td>1723599</td>\n",
       "      <td>APX OPERATING COMPANY LLC</td>\n",
       "      <td>Boomers Boca Raton</td>\n",
       "      <td>850850598.0</td>\n",
       "      <td>3100 Airport Road</td>\n",
       "      <td>Boca Raton</td>\n",
       "      <td>FL</td>\n",
       "      <td>33431.0</td>\n",
       "      <td>713110</td>\n",
       "      <td>713110 Amusement and Theme Parks</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>741180</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2</td>\n",
       "      <td>2021</td>\n",
       "      <td>3/12/2022 19:24:00</td>\n",
       "      <td>missing days away from work info - corrected</td>\n",
       "      <td>ITA Data CY 2021 submitted thru 3-14-2022 v3.csv</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1634481</th>\n",
       "      <td>1723601</td>\n",
       "      <td>APX OPERATING COMPANY LLC</td>\n",
       "      <td>Boomers Irvine</td>\n",
       "      <td>850850598.0</td>\n",
       "      <td>3405 Michelson Drive</td>\n",
       "      <td>Irvine</td>\n",
       "      <td>CA</td>\n",
       "      <td>92612.0</td>\n",
       "      <td>713110</td>\n",
       "      <td>713110 Amusement and Theme Parks</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>741182</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2</td>\n",
       "      <td>2021</td>\n",
       "      <td>3/12/2022 19:11:50</td>\n",
       "      <td>missing days away from work info - corrected</td>\n",
       "      <td>ITA Data CY 2021 submitted thru 3-14-2022 v3.csv</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>24531 rows × 32 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "              id                       company_name  \\\n",
       "215292    475886                         Sion Power   \n",
       "215832    476426  Cardiology of Southern California   \n",
       "217693    478674             Chicken Hawk Transport   \n",
       "221546    482527         Scougal Rubber Corporation   \n",
       "228725    489709                        NuWeld Inc.   \n",
       "...          ...                                ...   \n",
       "1634472  1723592         Century Linen Service, Inc   \n",
       "1634474  1723594         CENTURY LINEN& UNIFORM INC   \n",
       "1634478  1723598          APX OPERATING COMPANY LLC   \n",
       "1634479  1723599          APX OPERATING COMPANY LLC   \n",
       "1634481  1723601          APX OPERATING COMPANY LLC   \n",
       "\n",
       "                        establishment_name          ein  \\\n",
       "215292          Sion Power Tucson Facility  112955458.0   \n",
       "215832   Cardiology of Southern California  208275855.0   \n",
       "217693              Chicken Hawk Transport  820516501.0   \n",
       "221546          Scougal Rubber Corporation  900907093.0   \n",
       "228725                         NuWeld Inc.  232922276.0   \n",
       "...                                    ...          ...   \n",
       "1634472         Century Linen Service, Inc  205200067.0   \n",
       "1634474                           SYRACUSE  141009105.0   \n",
       "1634478                  Boomers Livermore  850850598.0   \n",
       "1634479                 Boomers Boca Raton  850850598.0   \n",
       "1634481                     Boomers Irvine  850850598.0   \n",
       "\n",
       "                       street_address           city state  zip_code  \\\n",
       "215292            2900 E. Elvira Road         Tucson    AZ   85756.0   \n",
       "215832      655 Euclid Ave, Suite 304  National City    CA   91950.0   \n",
       "217693               235 London Drive         Sparks    NV   89434.0   \n",
       "221546   885 Denmark Drive, Suite 103       McCarran    NV   89434.0   \n",
       "228725                2600 Reach Road   Williamsport    PA   17701.0   \n",
       "...                               ...            ...   ...       ...   \n",
       "1634472            542 N Perry Street      Johnstown    NY   12095.0   \n",
       "1634474           320 W TAYLOR STREET       SYRACUSE    NY   13507.0   \n",
       "1634478           400 Kitty Hawk Road      Livermore    CA   94551.0   \n",
       "1634479             3100 Airport Road     Boca Raton    FL   33431.0   \n",
       "1634481          3405 Michelson Drive         Irvine    CA   92612.0   \n",
       "\n",
       "         naics_code                               industry_description  ...  \\\n",
       "215292       335911            Batteries, rechargeable, manufacturing   ...   \n",
       "215832       621111   MDs' (medical doctors, except mental health) ...  ...   \n",
       "217693       484121   General freight trucking, long-distance, truc...  ...   \n",
       "221546       326291   Rubber goods, mechanical (i.e.,  extruded, la...  ...   \n",
       "228725       332996   Fabricated pipe and pipe fittings made from p...  ...   \n",
       "...             ...                                                ...  ...   \n",
       "1634472      812331               Laundries, linen and uniform supply   ...   \n",
       "1634474      812331               Laundries, linen and uniform supply   ...   \n",
       "1634478      713110                   713110 Amusement and Theme Parks  ...   \n",
       "1634479      713110                   713110 Amusement and Theme Parks  ...   \n",
       "1634481      713110                   713110 Amusement and Theme Parks  ...   \n",
       "\n",
       "         total_skin_disorders  total_hearing_loss  total_other_illnesses  \\\n",
       "215292                      0                   0                      0   \n",
       "215832                      0                   0                      0   \n",
       "217693                      0                   0                      0   \n",
       "221546                      0                   0                      0   \n",
       "228725                      0                   0                      0   \n",
       "...                       ...                 ...                    ...   \n",
       "1634472                     0                   0                      0   \n",
       "1634474                     0                   0                      0   \n",
       "1634478                     0                   0                      0   \n",
       "1634479                     0                   0                      1   \n",
       "1634481                     0                   0                      0   \n",
       "\n",
       "         establishment_id  establishment_type  size  year_filing_for  \\\n",
       "215292             407479                 1.0     2             2018   \n",
       "215832             407869                 1.0     2             2018   \n",
       "217693             408826                 1.0     2             2018   \n",
       "221546              78702                 1.0     2             2018   \n",
       "228725             284571                 1.0     2             2018   \n",
       "...                   ...                 ...   ...              ...   \n",
       "1634472            626501                 1.0     2             2021   \n",
       "1634474            629942                 1.0     2             2021   \n",
       "1634478            741179                 1.0     2             2021   \n",
       "1634479            741180                 1.0     2             2021   \n",
       "1634481            741182                 1.0     2             2021   \n",
       "\n",
       "          created_timestamp  \\\n",
       "215292    1/2/2019 19:47:19   \n",
       "215832    1/3/2019 20:38:52   \n",
       "217693    1/8/2019 18:48:31   \n",
       "221546   1/11/2019 22:43:22   \n",
       "228725   1/18/2019 14:41:02   \n",
       "...                     ...   \n",
       "1634472  3/12/2022 18:56:29   \n",
       "1634474  3/12/2022 18:55:20   \n",
       "1634478  3/12/2022 19:11:25   \n",
       "1634479  3/12/2022 19:24:00   \n",
       "1634481  3/12/2022 19:11:50   \n",
       "\n",
       "                                             change_reason  \\\n",
       "215292   I have recently taken over this position for m...   \n",
       "215832   Incorrect Average number of employees, said 2 ...   \n",
       "217693                   Correction on case classification   \n",
       "221546          Website indicates submission was necessary   \n",
       "228725                                     change in hours   \n",
       "...                                                    ...   \n",
       "1634472  The original data was used from 2020 not 2021 ...   \n",
       "1634474  The original data was used from 2020 not 2021 ...   \n",
       "1634478       missing days away from work info - corrected   \n",
       "1634479       missing days away from work info - corrected   \n",
       "1634481       missing days away from work info - corrected   \n",
       "\n",
       "                                                   source  \n",
       "215292                               ITA Data CY 2018.csv  \n",
       "215832                               ITA Data CY 2018.csv  \n",
       "217693                               ITA Data CY 2018.csv  \n",
       "221546                               ITA Data CY 2018.csv  \n",
       "228725                               ITA Data CY 2018.csv  \n",
       "...                                                   ...  \n",
       "1634472  ITA Data CY 2021 submitted thru 3-14-2022 v3.csv  \n",
       "1634474  ITA Data CY 2021 submitted thru 3-14-2022 v3.csv  \n",
       "1634478  ITA Data CY 2021 submitted thru 3-14-2022 v3.csv  \n",
       "1634479  ITA Data CY 2021 submitted thru 3-14-2022 v3.csv  \n",
       "1634481  ITA Data CY 2021 submitted thru 3-14-2022 v3.csv  \n",
       "\n",
       "[24531 rows x 32 columns]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "13ab2bd5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# De-duplicate: drop rows that are exact duplicates across every column.\n",
    "# The result is rebound to `df` rather than using inplace=True, so the frame\n",
    "# object that earlier cells displayed is not mutated behind their back.\n",
    "# NOTE(review): the saved outputs show 24531 rows both before and after this\n",
    "# cell, i.e. nothing was removed. If duplicate submissions are expected, the\n",
    "# `id` column may be keeping every row distinct -- confirm whether a\n",
    "# `subset=` of business-key columns is needed here.\n",
    "df = df.drop_duplicates()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "b23c1747-d151-4bd3-8ff7-fcaee37da363",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>company_name</th>\n",
       "      <th>establishment_name</th>\n",
       "      <th>ein</th>\n",
       "      <th>street_address</th>\n",
       "      <th>city</th>\n",
       "      <th>state</th>\n",
       "      <th>zip_code</th>\n",
       "      <th>naics_code</th>\n",
       "      <th>industry_description</th>\n",
       "      <th>...</th>\n",
       "      <th>total_skin_disorders</th>\n",
       "      <th>total_hearing_loss</th>\n",
       "      <th>total_other_illnesses</th>\n",
       "      <th>establishment_id</th>\n",
       "      <th>establishment_type</th>\n",
       "      <th>size</th>\n",
       "      <th>year_filing_for</th>\n",
       "      <th>created_timestamp</th>\n",
       "      <th>change_reason</th>\n",
       "      <th>source</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>215292</th>\n",
       "      <td>475886</td>\n",
       "      <td>Sion Power</td>\n",
       "      <td>Sion Power Tucson Facility</td>\n",
       "      <td>112955458.0</td>\n",
       "      <td>2900 E. Elvira Road</td>\n",
       "      <td>Tucson</td>\n",
       "      <td>AZ</td>\n",
       "      <td>85756.0</td>\n",
       "      <td>335911</td>\n",
       "      <td>Batteries, rechargeable, manufacturing</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>407479</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2</td>\n",
       "      <td>2018</td>\n",
       "      <td>1/2/2019 19:47:19</td>\n",
       "      <td>I have recently taken over this position for m...</td>\n",
       "      <td>ITA Data CY 2018.csv</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>215832</th>\n",
       "      <td>476426</td>\n",
       "      <td>Cardiology of Southern California</td>\n",
       "      <td>Cardiology of Southern California</td>\n",
       "      <td>208275855.0</td>\n",
       "      <td>655 Euclid Ave, Suite 304</td>\n",
       "      <td>National City</td>\n",
       "      <td>CA</td>\n",
       "      <td>91950.0</td>\n",
       "      <td>621111</td>\n",
       "      <td>MDs' (medical doctors, except mental health) ...</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>407869</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2</td>\n",
       "      <td>2018</td>\n",
       "      <td>1/3/2019 20:38:52</td>\n",
       "      <td>Incorrect Average number of employees, said 2 ...</td>\n",
       "      <td>ITA Data CY 2018.csv</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>217693</th>\n",
       "      <td>478674</td>\n",
       "      <td>Chicken Hawk Transport</td>\n",
       "      <td>Chicken Hawk Transport</td>\n",
       "      <td>820516501.0</td>\n",
       "      <td>235 London Drive</td>\n",
       "      <td>Sparks</td>\n",
       "      <td>NV</td>\n",
       "      <td>89434.0</td>\n",
       "      <td>484121</td>\n",
       "      <td>General freight trucking, long-distance, truc...</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>408826</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2</td>\n",
       "      <td>2018</td>\n",
       "      <td>1/8/2019 18:48:31</td>\n",
       "      <td>Correction on case classification</td>\n",
       "      <td>ITA Data CY 2018.csv</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>221546</th>\n",
       "      <td>482527</td>\n",
       "      <td>Scougal Rubber Corporation</td>\n",
       "      <td>Scougal Rubber Corporation</td>\n",
       "      <td>900907093.0</td>\n",
       "      <td>885 Denmark Drive, Suite 103</td>\n",
       "      <td>McCarran</td>\n",
       "      <td>NV</td>\n",
       "      <td>89434.0</td>\n",
       "      <td>326291</td>\n",
       "      <td>Rubber goods, mechanical (i.e.,  extruded, la...</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>78702</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2</td>\n",
       "      <td>2018</td>\n",
       "      <td>1/11/2019 22:43:22</td>\n",
       "      <td>Website indicates submission was necessary</td>\n",
       "      <td>ITA Data CY 2018.csv</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>228725</th>\n",
       "      <td>489709</td>\n",
       "      <td>NuWeld Inc.</td>\n",
       "      <td>NuWeld Inc.</td>\n",
       "      <td>232922276.0</td>\n",
       "      <td>2600 Reach Road</td>\n",
       "      <td>Williamsport</td>\n",
       "      <td>PA</td>\n",
       "      <td>17701.0</td>\n",
       "      <td>332996</td>\n",
       "      <td>Fabricated pipe and pipe fittings made from p...</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>284571</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2</td>\n",
       "      <td>2018</td>\n",
       "      <td>1/18/2019 14:41:02</td>\n",
       "      <td>change in hours</td>\n",
       "      <td>ITA Data CY 2018.csv</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1634472</th>\n",
       "      <td>1723592</td>\n",
       "      <td>Century Linen Service, Inc</td>\n",
       "      <td>Century Linen Service, Inc</td>\n",
       "      <td>205200067.0</td>\n",
       "      <td>542 N Perry Street</td>\n",
       "      <td>Johnstown</td>\n",
       "      <td>NY</td>\n",
       "      <td>12095.0</td>\n",
       "      <td>812331</td>\n",
       "      <td>Laundries, linen and uniform supply</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>626501</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2</td>\n",
       "      <td>2021</td>\n",
       "      <td>3/12/2022 18:56:29</td>\n",
       "      <td>The original data was used from 2020 not 2021 ...</td>\n",
       "      <td>ITA Data CY 2021 submitted thru 3-14-2022 v3.csv</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1634474</th>\n",
       "      <td>1723594</td>\n",
       "      <td>CENTURY LINEN&amp; UNIFORM INC</td>\n",
       "      <td>SYRACUSE</td>\n",
       "      <td>141009105.0</td>\n",
       "      <td>320 W TAYLOR STREET</td>\n",
       "      <td>SYRACUSE</td>\n",
       "      <td>NY</td>\n",
       "      <td>13507.0</td>\n",
       "      <td>812331</td>\n",
       "      <td>Laundries, linen and uniform supply</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>629942</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2</td>\n",
       "      <td>2021</td>\n",
       "      <td>3/12/2022 18:55:20</td>\n",
       "      <td>The original data was used from 2020 not 2021 ...</td>\n",
       "      <td>ITA Data CY 2021 submitted thru 3-14-2022 v3.csv</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1634478</th>\n",
       "      <td>1723598</td>\n",
       "      <td>APX OPERATING COMPANY LLC</td>\n",
       "      <td>Boomers Livermore</td>\n",
       "      <td>850850598.0</td>\n",
       "      <td>400 Kitty Hawk Road</td>\n",
       "      <td>Livermore</td>\n",
       "      <td>CA</td>\n",
       "      <td>94551.0</td>\n",
       "      <td>713110</td>\n",
       "      <td>713110 Amusement and Theme Parks</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>741179</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2</td>\n",
       "      <td>2021</td>\n",
       "      <td>3/12/2022 19:11:25</td>\n",
       "      <td>missing days away from work info - corrected</td>\n",
       "      <td>ITA Data CY 2021 submitted thru 3-14-2022 v3.csv</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1634479</th>\n",
       "      <td>1723599</td>\n",
       "      <td>APX OPERATING COMPANY LLC</td>\n",
       "      <td>Boomers Boca Raton</td>\n",
       "      <td>850850598.0</td>\n",
       "      <td>3100 Airport Road</td>\n",
       "      <td>Boca Raton</td>\n",
       "      <td>FL</td>\n",
       "      <td>33431.0</td>\n",
       "      <td>713110</td>\n",
       "      <td>713110 Amusement and Theme Parks</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>741180</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2</td>\n",
       "      <td>2021</td>\n",
       "      <td>3/12/2022 19:24:00</td>\n",
       "      <td>missing days away from work info - corrected</td>\n",
       "      <td>ITA Data CY 2021 submitted thru 3-14-2022 v3.csv</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1634481</th>\n",
       "      <td>1723601</td>\n",
       "      <td>APX OPERATING COMPANY LLC</td>\n",
       "      <td>Boomers Irvine</td>\n",
       "      <td>850850598.0</td>\n",
       "      <td>3405 Michelson Drive</td>\n",
       "      <td>Irvine</td>\n",
       "      <td>CA</td>\n",
       "      <td>92612.0</td>\n",
       "      <td>713110</td>\n",
       "      <td>713110 Amusement and Theme Parks</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>741182</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2</td>\n",
       "      <td>2021</td>\n",
       "      <td>3/12/2022 19:11:50</td>\n",
       "      <td>missing days away from work info - corrected</td>\n",
       "      <td>ITA Data CY 2021 submitted thru 3-14-2022 v3.csv</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>24531 rows × 32 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "              id                       company_name  \\\n",
       "215292    475886                         Sion Power   \n",
       "215832    476426  Cardiology of Southern California   \n",
       "217693    478674             Chicken Hawk Transport   \n",
       "221546    482527         Scougal Rubber Corporation   \n",
       "228725    489709                        NuWeld Inc.   \n",
       "...          ...                                ...   \n",
       "1634472  1723592         Century Linen Service, Inc   \n",
       "1634474  1723594         CENTURY LINEN& UNIFORM INC   \n",
       "1634478  1723598          APX OPERATING COMPANY LLC   \n",
       "1634479  1723599          APX OPERATING COMPANY LLC   \n",
       "1634481  1723601          APX OPERATING COMPANY LLC   \n",
       "\n",
       "                        establishment_name          ein  \\\n",
       "215292          Sion Power Tucson Facility  112955458.0   \n",
       "215832   Cardiology of Southern California  208275855.0   \n",
       "217693              Chicken Hawk Transport  820516501.0   \n",
       "221546          Scougal Rubber Corporation  900907093.0   \n",
       "228725                         NuWeld Inc.  232922276.0   \n",
       "...                                    ...          ...   \n",
       "1634472         Century Linen Service, Inc  205200067.0   \n",
       "1634474                           SYRACUSE  141009105.0   \n",
       "1634478                  Boomers Livermore  850850598.0   \n",
       "1634479                 Boomers Boca Raton  850850598.0   \n",
       "1634481                     Boomers Irvine  850850598.0   \n",
       "\n",
       "                       street_address           city state  zip_code  \\\n",
       "215292            2900 E. Elvira Road         Tucson    AZ   85756.0   \n",
       "215832      655 Euclid Ave, Suite 304  National City    CA   91950.0   \n",
       "217693               235 London Drive         Sparks    NV   89434.0   \n",
       "221546   885 Denmark Drive, Suite 103       McCarran    NV   89434.0   \n",
       "228725                2600 Reach Road   Williamsport    PA   17701.0   \n",
       "...                               ...            ...   ...       ...   \n",
       "1634472            542 N Perry Street      Johnstown    NY   12095.0   \n",
       "1634474           320 W TAYLOR STREET       SYRACUSE    NY   13507.0   \n",
       "1634478           400 Kitty Hawk Road      Livermore    CA   94551.0   \n",
       "1634479             3100 Airport Road     Boca Raton    FL   33431.0   \n",
       "1634481          3405 Michelson Drive         Irvine    CA   92612.0   \n",
       "\n",
       "         naics_code                               industry_description  ...  \\\n",
       "215292       335911            Batteries, rechargeable, manufacturing   ...   \n",
       "215832       621111   MDs' (medical doctors, except mental health) ...  ...   \n",
       "217693       484121   General freight trucking, long-distance, truc...  ...   \n",
       "221546       326291   Rubber goods, mechanical (i.e.,  extruded, la...  ...   \n",
       "228725       332996   Fabricated pipe and pipe fittings made from p...  ...   \n",
       "...             ...                                                ...  ...   \n",
       "1634472      812331               Laundries, linen and uniform supply   ...   \n",
       "1634474      812331               Laundries, linen and uniform supply   ...   \n",
       "1634478      713110                   713110 Amusement and Theme Parks  ...   \n",
       "1634479      713110                   713110 Amusement and Theme Parks  ...   \n",
       "1634481      713110                   713110 Amusement and Theme Parks  ...   \n",
       "\n",
       "         total_skin_disorders  total_hearing_loss  total_other_illnesses  \\\n",
       "215292                      0                   0                      0   \n",
       "215832                      0                   0                      0   \n",
       "217693                      0                   0                      0   \n",
       "221546                      0                   0                      0   \n",
       "228725                      0                   0                      0   \n",
       "...                       ...                 ...                    ...   \n",
       "1634472                     0                   0                      0   \n",
       "1634474                     0                   0                      0   \n",
       "1634478                     0                   0                      0   \n",
       "1634479                     0                   0                      1   \n",
       "1634481                     0                   0                      0   \n",
       "\n",
       "         establishment_id  establishment_type  size  year_filing_for  \\\n",
       "215292             407479                 1.0     2             2018   \n",
       "215832             407869                 1.0     2             2018   \n",
       "217693             408826                 1.0     2             2018   \n",
       "221546              78702                 1.0     2             2018   \n",
       "228725             284571                 1.0     2             2018   \n",
       "...                   ...                 ...   ...              ...   \n",
       "1634472            626501                 1.0     2             2021   \n",
       "1634474            629942                 1.0     2             2021   \n",
       "1634478            741179                 1.0     2             2021   \n",
       "1634479            741180                 1.0     2             2021   \n",
       "1634481            741182                 1.0     2             2021   \n",
       "\n",
       "          created_timestamp  \\\n",
       "215292    1/2/2019 19:47:19   \n",
       "215832    1/3/2019 20:38:52   \n",
       "217693    1/8/2019 18:48:31   \n",
       "221546   1/11/2019 22:43:22   \n",
       "228725   1/18/2019 14:41:02   \n",
       "...                     ...   \n",
       "1634472  3/12/2022 18:56:29   \n",
       "1634474  3/12/2022 18:55:20   \n",
       "1634478  3/12/2022 19:11:25   \n",
       "1634479  3/12/2022 19:24:00   \n",
       "1634481  3/12/2022 19:11:50   \n",
       "\n",
       "                                             change_reason  \\\n",
       "215292   I have recently taken over this position for m...   \n",
       "215832   Incorrect Average number of employees, said 2 ...   \n",
       "217693                   Correction on case classification   \n",
       "221546          Website indicates submission was necessary   \n",
       "228725                                     change in hours   \n",
       "...                                                    ...   \n",
       "1634472  The original data was used from 2020 not 2021 ...   \n",
       "1634474  The original data was used from 2020 not 2021 ...   \n",
       "1634478       missing days away from work info - corrected   \n",
       "1634479       missing days away from work info - corrected   \n",
       "1634481       missing days away from work info - corrected   \n",
       "\n",
       "                                                   source  \n",
       "215292                               ITA Data CY 2018.csv  \n",
       "215832                               ITA Data CY 2018.csv  \n",
       "217693                               ITA Data CY 2018.csv  \n",
       "221546                               ITA Data CY 2018.csv  \n",
       "228725                               ITA Data CY 2018.csv  \n",
       "...                                                   ...  \n",
       "1634472  ITA Data CY 2021 submitted thru 3-14-2022 v3.csv  \n",
       "1634474  ITA Data CY 2021 submitted thru 3-14-2022 v3.csv  \n",
       "1634478  ITA Data CY 2021 submitted thru 3-14-2022 v3.csv  \n",
       "1634479  ITA Data CY 2021 submitted thru 3-14-2022 v3.csv  \n",
       "1634481  ITA Data CY 2021 submitted thru 3-14-2022 v3.csv  \n",
       "\n",
       "[24531 rows x 32 columns]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Re-display the frame after drop_duplicates to inspect the result.\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "042875e2-529d-4f52-9a4d-56f0b956b261",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Normalise every state abbreviation to upper case.\n",
    "# The column is overwritten in place on `df`; re-running the cell is harmless\n",
    "# because upper-casing an already upper-cased value changes nothing.\n",
    "df['state'] = df.loc[:, 'state'].str.upper()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "1542f6bd-2085-4b14-914a-49b5c67a52ac",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>company_name</th>\n",
       "      <th>establishment_name</th>\n",
       "      <th>ein</th>\n",
       "      <th>street_address</th>\n",
       "      <th>city</th>\n",
       "      <th>state</th>\n",
       "      <th>zip_code</th>\n",
       "      <th>naics_code</th>\n",
       "      <th>industry_description</th>\n",
       "      <th>...</th>\n",
       "      <th>total_skin_disorders</th>\n",
       "      <th>total_hearing_loss</th>\n",
       "      <th>total_other_illnesses</th>\n",
       "      <th>establishment_id</th>\n",
       "      <th>establishment_type</th>\n",
       "      <th>size</th>\n",
       "      <th>year_filing_for</th>\n",
       "      <th>created_timestamp</th>\n",
       "      <th>change_reason</th>\n",
       "      <th>source</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>215292</th>\n",
       "      <td>475886</td>\n",
       "      <td>Sion Power</td>\n",
       "      <td>Sion Power Tucson Facility</td>\n",
       "      <td>112955458.0</td>\n",
       "      <td>2900 E. Elvira Road</td>\n",
       "      <td>Tucson</td>\n",
       "      <td>AZ</td>\n",
       "      <td>85756.0</td>\n",
       "      <td>335911</td>\n",
       "      <td>Batteries, rechargeable, manufacturing</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>407479</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2</td>\n",
       "      <td>2018</td>\n",
       "      <td>1/2/2019 19:47:19</td>\n",
       "      <td>I have recently taken over this position for m...</td>\n",
       "      <td>ITA Data CY 2018.csv</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>215832</th>\n",
       "      <td>476426</td>\n",
       "      <td>Cardiology of Southern California</td>\n",
       "      <td>Cardiology of Southern California</td>\n",
       "      <td>208275855.0</td>\n",
       "      <td>655 Euclid Ave, Suite 304</td>\n",
       "      <td>National City</td>\n",
       "      <td>CA</td>\n",
       "      <td>91950.0</td>\n",
       "      <td>621111</td>\n",
       "      <td>MDs' (medical doctors, except mental health) ...</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>407869</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2</td>\n",
       "      <td>2018</td>\n",
       "      <td>1/3/2019 20:38:52</td>\n",
       "      <td>Incorrect Average number of employees, said 2 ...</td>\n",
       "      <td>ITA Data CY 2018.csv</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>217693</th>\n",
       "      <td>478674</td>\n",
       "      <td>Chicken Hawk Transport</td>\n",
       "      <td>Chicken Hawk Transport</td>\n",
       "      <td>820516501.0</td>\n",
       "      <td>235 London Drive</td>\n",
       "      <td>Sparks</td>\n",
       "      <td>NV</td>\n",
       "      <td>89434.0</td>\n",
       "      <td>484121</td>\n",
       "      <td>General freight trucking, long-distance, truc...</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>408826</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2</td>\n",
       "      <td>2018</td>\n",
       "      <td>1/8/2019 18:48:31</td>\n",
       "      <td>Correction on case classification</td>\n",
       "      <td>ITA Data CY 2018.csv</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>221546</th>\n",
       "      <td>482527</td>\n",
       "      <td>Scougal Rubber Corporation</td>\n",
       "      <td>Scougal Rubber Corporation</td>\n",
       "      <td>900907093.0</td>\n",
       "      <td>885 Denmark Drive, Suite 103</td>\n",
       "      <td>McCarran</td>\n",
       "      <td>NV</td>\n",
       "      <td>89434.0</td>\n",
       "      <td>326291</td>\n",
       "      <td>Rubber goods, mechanical (i.e.,  extruded, la...</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>78702</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2</td>\n",
       "      <td>2018</td>\n",
       "      <td>1/11/2019 22:43:22</td>\n",
       "      <td>Website indicates submission was necessary</td>\n",
       "      <td>ITA Data CY 2018.csv</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>228725</th>\n",
       "      <td>489709</td>\n",
       "      <td>NuWeld Inc.</td>\n",
       "      <td>NuWeld Inc.</td>\n",
       "      <td>232922276.0</td>\n",
       "      <td>2600 Reach Road</td>\n",
       "      <td>Williamsport</td>\n",
       "      <td>PA</td>\n",
       "      <td>17701.0</td>\n",
       "      <td>332996</td>\n",
       "      <td>Fabricated pipe and pipe fittings made from p...</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>284571</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2</td>\n",
       "      <td>2018</td>\n",
       "      <td>1/18/2019 14:41:02</td>\n",
       "      <td>change in hours</td>\n",
       "      <td>ITA Data CY 2018.csv</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1634472</th>\n",
       "      <td>1723592</td>\n",
       "      <td>Century Linen Service, Inc</td>\n",
       "      <td>Century Linen Service, Inc</td>\n",
       "      <td>205200067.0</td>\n",
       "      <td>542 N Perry Street</td>\n",
       "      <td>Johnstown</td>\n",
       "      <td>NY</td>\n",
       "      <td>12095.0</td>\n",
       "      <td>812331</td>\n",
       "      <td>Laundries, linen and uniform supply</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>626501</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2</td>\n",
       "      <td>2021</td>\n",
       "      <td>3/12/2022 18:56:29</td>\n",
       "      <td>The original data was used from 2020 not 2021 ...</td>\n",
       "      <td>ITA Data CY 2021 submitted thru 3-14-2022 v3.csv</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1634474</th>\n",
       "      <td>1723594</td>\n",
       "      <td>CENTURY LINEN&amp; UNIFORM INC</td>\n",
       "      <td>SYRACUSE</td>\n",
       "      <td>141009105.0</td>\n",
       "      <td>320 W TAYLOR STREET</td>\n",
       "      <td>SYRACUSE</td>\n",
       "      <td>NY</td>\n",
       "      <td>13507.0</td>\n",
       "      <td>812331</td>\n",
       "      <td>Laundries, linen and uniform supply</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>629942</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2</td>\n",
       "      <td>2021</td>\n",
       "      <td>3/12/2022 18:55:20</td>\n",
       "      <td>The original data was used from 2020 not 2021 ...</td>\n",
       "      <td>ITA Data CY 2021 submitted thru 3-14-2022 v3.csv</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1634478</th>\n",
       "      <td>1723598</td>\n",
       "      <td>APX OPERATING COMPANY LLC</td>\n",
       "      <td>Boomers Livermore</td>\n",
       "      <td>850850598.0</td>\n",
       "      <td>400 Kitty Hawk Road</td>\n",
       "      <td>Livermore</td>\n",
       "      <td>CA</td>\n",
       "      <td>94551.0</td>\n",
       "      <td>713110</td>\n",
       "      <td>713110 Amusement and Theme Parks</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>741179</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2</td>\n",
       "      <td>2021</td>\n",
       "      <td>3/12/2022 19:11:25</td>\n",
       "      <td>missing days away from work info - corrected</td>\n",
       "      <td>ITA Data CY 2021 submitted thru 3-14-2022 v3.csv</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1634479</th>\n",
       "      <td>1723599</td>\n",
       "      <td>APX OPERATING COMPANY LLC</td>\n",
       "      <td>Boomers Boca Raton</td>\n",
       "      <td>850850598.0</td>\n",
       "      <td>3100 Airport Road</td>\n",
       "      <td>Boca Raton</td>\n",
       "      <td>FL</td>\n",
       "      <td>33431.0</td>\n",
       "      <td>713110</td>\n",
       "      <td>713110 Amusement and Theme Parks</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>741180</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2</td>\n",
       "      <td>2021</td>\n",
       "      <td>3/12/2022 19:24:00</td>\n",
       "      <td>missing days away from work info - corrected</td>\n",
       "      <td>ITA Data CY 2021 submitted thru 3-14-2022 v3.csv</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1634481</th>\n",
       "      <td>1723601</td>\n",
       "      <td>APX OPERATING COMPANY LLC</td>\n",
       "      <td>Boomers Irvine</td>\n",
       "      <td>850850598.0</td>\n",
       "      <td>3405 Michelson Drive</td>\n",
       "      <td>Irvine</td>\n",
       "      <td>CA</td>\n",
       "      <td>92612.0</td>\n",
       "      <td>713110</td>\n",
       "      <td>713110 Amusement and Theme Parks</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>741182</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2</td>\n",
       "      <td>2021</td>\n",
       "      <td>3/12/2022 19:11:50</td>\n",
       "      <td>missing days away from work info - corrected</td>\n",
       "      <td>ITA Data CY 2021 submitted thru 3-14-2022 v3.csv</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>24531 rows × 32 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "              id                       company_name  \\\n",
       "215292    475886                         Sion Power   \n",
       "215832    476426  Cardiology of Southern California   \n",
       "217693    478674             Chicken Hawk Transport   \n",
       "221546    482527         Scougal Rubber Corporation   \n",
       "228725    489709                        NuWeld Inc.   \n",
       "...          ...                                ...   \n",
       "1634472  1723592         Century Linen Service, Inc   \n",
       "1634474  1723594         CENTURY LINEN& UNIFORM INC   \n",
       "1634478  1723598          APX OPERATING COMPANY LLC   \n",
       "1634479  1723599          APX OPERATING COMPANY LLC   \n",
       "1634481  1723601          APX OPERATING COMPANY LLC   \n",
       "\n",
       "                        establishment_name          ein  \\\n",
       "215292          Sion Power Tucson Facility  112955458.0   \n",
       "215832   Cardiology of Southern California  208275855.0   \n",
       "217693              Chicken Hawk Transport  820516501.0   \n",
       "221546          Scougal Rubber Corporation  900907093.0   \n",
       "228725                         NuWeld Inc.  232922276.0   \n",
       "...                                    ...          ...   \n",
       "1634472         Century Linen Service, Inc  205200067.0   \n",
       "1634474                           SYRACUSE  141009105.0   \n",
       "1634478                  Boomers Livermore  850850598.0   \n",
       "1634479                 Boomers Boca Raton  850850598.0   \n",
       "1634481                     Boomers Irvine  850850598.0   \n",
       "\n",
       "                       street_address           city state  zip_code  \\\n",
       "215292            2900 E. Elvira Road         Tucson    AZ   85756.0   \n",
       "215832      655 Euclid Ave, Suite 304  National City    CA   91950.0   \n",
       "217693               235 London Drive         Sparks    NV   89434.0   \n",
       "221546   885 Denmark Drive, Suite 103       McCarran    NV   89434.0   \n",
       "228725                2600 Reach Road   Williamsport    PA   17701.0   \n",
       "...                               ...            ...   ...       ...   \n",
       "1634472            542 N Perry Street      Johnstown    NY   12095.0   \n",
       "1634474           320 W TAYLOR STREET       SYRACUSE    NY   13507.0   \n",
       "1634478           400 Kitty Hawk Road      Livermore    CA   94551.0   \n",
       "1634479             3100 Airport Road     Boca Raton    FL   33431.0   \n",
       "1634481          3405 Michelson Drive         Irvine    CA   92612.0   \n",
       "\n",
       "         naics_code                               industry_description  ...  \\\n",
       "215292       335911            Batteries, rechargeable, manufacturing   ...   \n",
       "215832       621111   MDs' (medical doctors, except mental health) ...  ...   \n",
       "217693       484121   General freight trucking, long-distance, truc...  ...   \n",
       "221546       326291   Rubber goods, mechanical (i.e.,  extruded, la...  ...   \n",
       "228725       332996   Fabricated pipe and pipe fittings made from p...  ...   \n",
       "...             ...                                                ...  ...   \n",
       "1634472      812331               Laundries, linen and uniform supply   ...   \n",
       "1634474      812331               Laundries, linen and uniform supply   ...   \n",
       "1634478      713110                   713110 Amusement and Theme Parks  ...   \n",
       "1634479      713110                   713110 Amusement and Theme Parks  ...   \n",
       "1634481      713110                   713110 Amusement and Theme Parks  ...   \n",
       "\n",
       "         total_skin_disorders  total_hearing_loss  total_other_illnesses  \\\n",
       "215292                      0                   0                      0   \n",
       "215832                      0                   0                      0   \n",
       "217693                      0                   0                      0   \n",
       "221546                      0                   0                      0   \n",
       "228725                      0                   0                      0   \n",
       "...                       ...                 ...                    ...   \n",
       "1634472                     0                   0                      0   \n",
       "1634474                     0                   0                      0   \n",
       "1634478                     0                   0                      0   \n",
       "1634479                     0                   0                      1   \n",
       "1634481                     0                   0                      0   \n",
       "\n",
       "         establishment_id  establishment_type  size  year_filing_for  \\\n",
       "215292             407479                 1.0     2             2018   \n",
       "215832             407869                 1.0     2             2018   \n",
       "217693             408826                 1.0     2             2018   \n",
       "221546              78702                 1.0     2             2018   \n",
       "228725             284571                 1.0     2             2018   \n",
       "...                   ...                 ...   ...              ...   \n",
       "1634472            626501                 1.0     2             2021   \n",
       "1634474            629942                 1.0     2             2021   \n",
       "1634478            741179                 1.0     2             2021   \n",
       "1634479            741180                 1.0     2             2021   \n",
       "1634481            741182                 1.0     2             2021   \n",
       "\n",
       "          created_timestamp  \\\n",
       "215292    1/2/2019 19:47:19   \n",
       "215832    1/3/2019 20:38:52   \n",
       "217693    1/8/2019 18:48:31   \n",
       "221546   1/11/2019 22:43:22   \n",
       "228725   1/18/2019 14:41:02   \n",
       "...                     ...   \n",
       "1634472  3/12/2022 18:56:29   \n",
       "1634474  3/12/2022 18:55:20   \n",
       "1634478  3/12/2022 19:11:25   \n",
       "1634479  3/12/2022 19:24:00   \n",
       "1634481  3/12/2022 19:11:50   \n",
       "\n",
       "                                             change_reason  \\\n",
       "215292   I have recently taken over this position for m...   \n",
       "215832   Incorrect Average number of employees, said 2 ...   \n",
       "217693                   Correction on case classification   \n",
       "221546          Website indicates submission was necessary   \n",
       "228725                                     change in hours   \n",
       "...                                                    ...   \n",
       "1634472  The original data was used from 2020 not 2021 ...   \n",
       "1634474  The original data was used from 2020 not 2021 ...   \n",
       "1634478       missing days away from work info - corrected   \n",
       "1634479       missing days away from work info - corrected   \n",
       "1634481       missing days away from work info - corrected   \n",
       "\n",
       "                                                   source  \n",
       "215292                               ITA Data CY 2018.csv  \n",
       "215832                               ITA Data CY 2018.csv  \n",
       "217693                               ITA Data CY 2018.csv  \n",
       "221546                               ITA Data CY 2018.csv  \n",
       "228725                               ITA Data CY 2018.csv  \n",
       "...                                                   ...  \n",
       "1634472  ITA Data CY 2021 submitted thru 3-14-2022 v3.csv  \n",
       "1634474  ITA Data CY 2021 submitted thru 3-14-2022 v3.csv  \n",
       "1634478  ITA Data CY 2021 submitted thru 3-14-2022 v3.csv  \n",
       "1634479  ITA Data CY 2021 submitted thru 3-14-2022 v3.csv  \n",
       "1634481  ITA Data CY 2021 submitted thru 3-14-2022 v3.csv  \n",
       "\n",
       "[24531 rows x 32 columns]"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "2a1bc77b-baab-49cf-a4f1-865f79754652",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Re-save the DataFrame to a new CSV file, specifying the field delimiter\n",
    "df.to_csv('ITA_OSHA_Combined_New.csv', sep='|', index=False, header=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "fa984b69-64a5-4450-a589-d22004fa4885",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv(\"/Users/wenzel/Downloads/ITA_OSHA_Combined.csv\", low_memory=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "1f98101f-92cf-49ea-9f20-4ef4c60cddaf",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>company_name</th>\n",
       "      <th>establishment_name</th>\n",
       "      <th>ein</th>\n",
       "      <th>street_address</th>\n",
       "      <th>city</th>\n",
       "      <th>state</th>\n",
       "      <th>zip_code</th>\n",
       "      <th>naics_code</th>\n",
       "      <th>industry_description</th>\n",
       "      <th>...</th>\n",
       "      <th>total_hearing_loss</th>\n",
       "      <th>total_other_illnesses</th>\n",
       "      <th>establishment_id</th>\n",
       "      <th>establishment_type</th>\n",
       "      <th>size</th>\n",
       "      <th>year_filing_for</th>\n",
       "      <th>created_timestamp</th>\n",
       "      <th>change_reason</th>\n",
       "      <th>source</th>\n",
       "      <th>delete</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>4</td>\n",
       "      <td>McKamish Inc.</td>\n",
       "      <td>McKamish Inc.</td>\n",
       "      <td>NaN</td>\n",
       "      <td>50 55th Street</td>\n",
       "      <td>Pittsburgh</td>\n",
       "      <td>PA</td>\n",
       "      <td>15201.0</td>\n",
       "      <td>238220</td>\n",
       "      <td>Heating, ventilation and air-conditioning (HVA...</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>41920</td>\n",
       "      <td>1.0</td>\n",
       "      <td>3</td>\n",
       "      <td>2016</td>\n",
       "      <td>8/1/2017 6:12:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>ITA Data CY 2016.csv</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>5</td>\n",
       "      <td>The Talaria Company LLC</td>\n",
       "      <td>The Hinckley Company</td>\n",
       "      <td>NaN</td>\n",
       "      <td>40 Industrial Way</td>\n",
       "      <td>Trenton</td>\n",
       "      <td>ME</td>\n",
       "      <td>4605.0</td>\n",
       "      <td>336612</td>\n",
       "      <td>Pleasure boats manufacturing</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>41922</td>\n",
       "      <td>1.0</td>\n",
       "      <td>3</td>\n",
       "      <td>2016</td>\n",
       "      <td>8/1/2017 6:23:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>ITA Data CY 2016.csv</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>6</td>\n",
       "      <td>Williamsburg Manufacturing</td>\n",
       "      <td>Williamsburg Manufacturing</td>\n",
       "      <td>NaN</td>\n",
       "      <td>408 Maplewood Ave</td>\n",
       "      <td>Williamsburg</td>\n",
       "      <td>IA</td>\n",
       "      <td>52361.0</td>\n",
       "      <td>336370</td>\n",
       "      <td>Motor vehicle metal parts stamping</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>41923</td>\n",
       "      <td>1.0</td>\n",
       "      <td>3</td>\n",
       "      <td>2016</td>\n",
       "      <td>8/1/2017 6:27:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>ITA Data CY 2016.csv</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>7</td>\n",
       "      <td>The Talaria Company LLC</td>\n",
       "      <td>Morris Yachts LLC</td>\n",
       "      <td>NaN</td>\n",
       "      <td>27 Ramp Road</td>\n",
       "      <td>Trenton</td>\n",
       "      <td>ME</td>\n",
       "      <td>4605.0</td>\n",
       "      <td>336612</td>\n",
       "      <td>Pleasure boats manufacturing</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>41925</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2</td>\n",
       "      <td>2016</td>\n",
       "      <td>8/1/2017 6:36:00</td>\n",
       "      <td>Hit wrong hyperlink</td>\n",
       "      <td>ITA Data CY 2016.csv</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>8</td>\n",
       "      <td>The Talaria Company LLC</td>\n",
       "      <td>Hunt Yachts LLC</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1909 Alden Landing</td>\n",
       "      <td>Portsmouth</td>\n",
       "      <td>RI</td>\n",
       "      <td>2871.0</td>\n",
       "      <td>336612</td>\n",
       "      <td>Pleasure boats manufacturing</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>41926</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2</td>\n",
       "      <td>2016</td>\n",
       "      <td>8/1/2017 6:35:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>ITA Data CY 2016.csv</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1635159</th>\n",
       "      <td>1724282</td>\n",
       "      <td>Thombert Inc</td>\n",
       "      <td>Thombert</td>\n",
       "      <td>420670188.0</td>\n",
       "      <td>220 Industrial Drive</td>\n",
       "      <td>Brooklyn</td>\n",
       "      <td>IA</td>\n",
       "      <td>52211.0</td>\n",
       "      <td>325211</td>\n",
       "      <td>Acetal resins manufacturing</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>893662</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2</td>\n",
       "      <td>2021</td>\n",
       "      <td>3/14/2022 21:39:34</td>\n",
       "      <td>NaN</td>\n",
       "      <td>ITA Data CY 2021 submitted thru 3-14-2022 v3.csv</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1635160</th>\n",
       "      <td>1724283</td>\n",
       "      <td>ADF International</td>\n",
       "      <td>ADF International</td>\n",
       "      <td>650370294.0</td>\n",
       "      <td>1900 Great Bear Ave.</td>\n",
       "      <td>Great Falls</td>\n",
       "      <td>MT</td>\n",
       "      <td>59404.0</td>\n",
       "      <td>332312</td>\n",
       "      <td>Structural steel, fabricated, manufacturing</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>893661</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2</td>\n",
       "      <td>2021</td>\n",
       "      <td>3/14/2022 21:40:52</td>\n",
       "      <td>NaN</td>\n",
       "      <td>ITA Data CY 2021 submitted thru 3-14-2022 v3.csv</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1635161</th>\n",
       "      <td>1724284</td>\n",
       "      <td>TDK Corporation of America</td>\n",
       "      <td>TCA - San Jose</td>\n",
       "      <td>952893461.0</td>\n",
       "      <td>1745 Technology Drive, Suite 200</td>\n",
       "      <td>San Jose</td>\n",
       "      <td>CA</td>\n",
       "      <td>95110.0</td>\n",
       "      <td>423690</td>\n",
       "      <td>Capacitors, electronic, merchant wholesalers</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>502665</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2</td>\n",
       "      <td>2021</td>\n",
       "      <td>3/14/2022 21:58:15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>ITA Data CY 2021 submitted thru 3-14-2022 v3.csv</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1635162</th>\n",
       "      <td>1724285</td>\n",
       "      <td>TDK Corporation of America</td>\n",
       "      <td>TCA Lincolnshire</td>\n",
       "      <td>952893461.0</td>\n",
       "      <td>475 Half Day Road</td>\n",
       "      <td>Lincolnshire</td>\n",
       "      <td>IL</td>\n",
       "      <td>60069.0</td>\n",
       "      <td>423690</td>\n",
       "      <td>Capacitors, electronic, merchant wholesalers</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>502663</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2</td>\n",
       "      <td>2021</td>\n",
       "      <td>3/14/2022 21:58:15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>ITA Data CY 2021 submitted thru 3-14-2022 v3.csv</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1635163</th>\n",
       "      <td>1724286</td>\n",
       "      <td>Morello Concrete Construction Inc.</td>\n",
       "      <td>Morello Concrete Construction Inc.</td>\n",
       "      <td>330449044.0</td>\n",
       "      <td>8534 Hubbles Lane</td>\n",
       "      <td>Santee</td>\n",
       "      <td>CA</td>\n",
       "      <td>92071.0</td>\n",
       "      <td>238110</td>\n",
       "      <td>Foundation, building, poured concrete, contra...</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>427167</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2</td>\n",
       "      <td>2021</td>\n",
       "      <td>3/14/2022 22:01:33</td>\n",
       "      <td>NaN</td>\n",
       "      <td>ITA Data CY 2021 submitted thru 3-14-2022 v3.csv</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1635164 rows × 33 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "              id                        company_name  \\\n",
       "0              4                       McKamish Inc.   \n",
       "1              5             The Talaria Company LLC   \n",
       "2              6          Williamsburg Manufacturing   \n",
       "3              7             The Talaria Company LLC   \n",
       "4              8             The Talaria Company LLC   \n",
       "...          ...                                 ...   \n",
       "1635159  1724282                        Thombert Inc   \n",
       "1635160  1724283                   ADF International   \n",
       "1635161  1724284          TDK Corporation of America   \n",
       "1635162  1724285          TDK Corporation of America   \n",
       "1635163  1724286  Morello Concrete Construction Inc.   \n",
       "\n",
       "                         establishment_name          ein  \\\n",
       "0                             McKamish Inc.          NaN   \n",
       "1                      The Hinckley Company          NaN   \n",
       "2                Williamsburg Manufacturing          NaN   \n",
       "3                         Morris Yachts LLC          NaN   \n",
       "4                           Hunt Yachts LLC          NaN   \n",
       "...                                     ...          ...   \n",
       "1635159                            Thombert  420670188.0   \n",
       "1635160                   ADF International  650370294.0   \n",
       "1635161                      TCA - San Jose  952893461.0   \n",
       "1635162                    TCA Lincolnshire  952893461.0   \n",
       "1635163  Morello Concrete Construction Inc.  330449044.0   \n",
       "\n",
       "                           street_address          city state  zip_code  \\\n",
       "0                          50 55th Street    Pittsburgh    PA   15201.0   \n",
       "1                       40 Industrial Way       Trenton    ME    4605.0   \n",
       "2                       408 Maplewood Ave  Williamsburg    IA   52361.0   \n",
       "3                            27 Ramp Road       Trenton    ME    4605.0   \n",
       "4                      1909 Alden Landing    Portsmouth    RI    2871.0   \n",
       "...                                   ...           ...   ...       ...   \n",
       "1635159              220 Industrial Drive      Brooklyn    IA   52211.0   \n",
       "1635160              1900 Great Bear Ave.   Great Falls    MT   59404.0   \n",
       "1635161  1745 Technology Drive, Suite 200      San Jose    CA   95110.0   \n",
       "1635162                 475 Half Day Road  Lincolnshire    IL   60069.0   \n",
       "1635163                 8534 Hubbles Lane        Santee    CA   92071.0   \n",
       "\n",
       "         naics_code                               industry_description  ...  \\\n",
       "0            238220  Heating, ventilation and air-conditioning (HVA...  ...   \n",
       "1            336612                       Pleasure boats manufacturing  ...   \n",
       "2            336370                 Motor vehicle metal parts stamping  ...   \n",
       "3            336612                       Pleasure boats manufacturing  ...   \n",
       "4            336612                       Pleasure boats manufacturing  ...   \n",
       "...             ...                                                ...  ...   \n",
       "1635159      325211                       Acetal resins manufacturing   ...   \n",
       "1635160      332312       Structural steel, fabricated, manufacturing   ...   \n",
       "1635161      423690      Capacitors, electronic, merchant wholesalers   ...   \n",
       "1635162      423690      Capacitors, electronic, merchant wholesalers   ...   \n",
       "1635163      238110   Foundation, building, poured concrete, contra...  ...   \n",
       "\n",
       "         total_hearing_loss  total_other_illnesses  establishment_id  \\\n",
       "0                         0                      0             41920   \n",
       "1                         0                      0             41922   \n",
       "2                         0                      0             41923   \n",
       "3                         0                      0             41925   \n",
       "4                         0                      0             41926   \n",
       "...                     ...                    ...               ...   \n",
       "1635159                   0                      0            893662   \n",
       "1635160                   0                      0            893661   \n",
       "1635161                   0                      0            502665   \n",
       "1635162                   0                      0            502663   \n",
       "1635163                   0                      0            427167   \n",
       "\n",
       "         establishment_type  size  year_filing_for   created_timestamp  \\\n",
       "0                       1.0     3             2016    8/1/2017 6:12:00   \n",
       "1                       1.0     3             2016    8/1/2017 6:23:00   \n",
       "2                       1.0     3             2016    8/1/2017 6:27:00   \n",
       "3                       1.0     2             2016    8/1/2017 6:36:00   \n",
       "4                       1.0     2             2016    8/1/2017 6:35:00   \n",
       "...                     ...   ...              ...                 ...   \n",
       "1635159                 1.0     2             2021  3/14/2022 21:39:34   \n",
       "1635160                 1.0     2             2021  3/14/2022 21:40:52   \n",
       "1635161                 1.0     2             2021  3/14/2022 21:58:15   \n",
       "1635162                 1.0     2             2021  3/14/2022 21:58:15   \n",
       "1635163                 1.0     2             2021  3/14/2022 22:01:33   \n",
       "\n",
       "               change_reason  \\\n",
       "0                        NaN   \n",
       "1                        NaN   \n",
       "2                        NaN   \n",
       "3        Hit wrong hyperlink   \n",
       "4                        NaN   \n",
       "...                      ...   \n",
       "1635159                  NaN   \n",
       "1635160                  NaN   \n",
       "1635161                  NaN   \n",
       "1635162                  NaN   \n",
       "1635163                  NaN   \n",
       "\n",
       "                                                   source  delete  \n",
       "0                                    ITA Data CY 2016.csv     NaN  \n",
       "1                                    ITA Data CY 2016.csv     NaN  \n",
       "2                                    ITA Data CY 2016.csv     NaN  \n",
       "3                                    ITA Data CY 2016.csv     NaN  \n",
       "4                                    ITA Data CY 2016.csv     NaN  \n",
       "...                                                   ...     ...  \n",
       "1635159  ITA Data CY 2021 submitted thru 3-14-2022 v3.csv     0.0  \n",
       "1635160  ITA Data CY 2021 submitted thru 3-14-2022 v3.csv     0.0  \n",
       "1635161  ITA Data CY 2021 submitted thru 3-14-2022 v3.csv     0.0  \n",
       "1635162  ITA Data CY 2021 submitted thru 3-14-2022 v3.csv     0.0  \n",
       "1635163  ITA Data CY 2021 submitted thru 3-14-2022 v3.csv     0.0  \n",
       "\n",
       "[1635164 rows x 33 columns]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
